Goal

The goal of this project is to characterize the texts of three popular horror authors and to identify the similarities and differences among them in the spooky dataset. The data consist of excerpts of texts written by Edgar Allan Poe (EAP), HP Lovecraft (HPL), and Mary Wollstonecraft Shelley (MWS).

Load the packages and read the data

Setup the libraries if not already installed

# Packages this analysis depends on.
packages.used <- c(
  "ggplot2", "plotrix", "waffle", "dplyr", "tibble", "tidyr", "stringr",
  "tidytext", "topicmodels", "wordcloud", "plotly", "webshot", "htmlwidgets",
  "reshape2"
)

# Packages from the list above that are not present in any local library.
# A plain setdiff() against the installed package names is sufficient.
packages.needed <- setdiff(packages.used, installed.packages()[, "Package"])

# Install only what is missing. The mirror is addressed over HTTPS rather than
# plain HTTP so that packages are not downloaded over an unencrypted channel.
if (length(packages.needed) > 0) {
  install.packages(
    packages.needed,
    dependencies = TRUE,
    repos = "https://cran.us.r-project.org"
  )
}


library(ggplot2)
library(dplyr)
library(tibble)
library(tidyr)
library(stringr)
library(tidytext)
library(topicmodels)
library(wordcloud)
library(plotrix)
library(waffle)
library(plotly)
library(webshot)
library(htmlwidgets)
library(reshape2)

Read in the data

spooky.csv is in the data folder, and this Rmd is inside the doc folder.

# Path is relative to this document's folder; as.is = TRUE keeps the text
# columns as character instead of converting them to factors.
spooky <- read.csv(file = "../data/spooky.csv", as.is = TRUE)

Overview of the dataset

Take a look at the first few rows and the dimensions of the dataset

# Show the first three excerpts with their id and author
head(spooky, n = 3)
##        id
## 1 id26305
## 2 id17569
## 3 id11008
##                                                                                                                                                                                                                                      text
## 1 This process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circuit, and return to the point whence I set out, without being aware of the fact; so perfectly uniform seemed the wall.
## 2                                                                                                                                                                 It never once occurred to me that the fumbling might be a mere mistake.
## 3                                In his left hand was a gold snuff box, from which, as he capered down the hill, cutting all manner of fantastic steps, he took snuff incessantly with an air of the greatest possible self satisfaction.
##   author
## 1    EAP
## 2    HPL
## 3    EAP
# Number of rows and number of columns, in that order
c(nrow(spooky), ncol(spooky))
## [1] 19579     3

Quick scan for missing values and variable types

# Total number of NA cells across every column of the data frame
sum(is.na(spooky))
## [1] 0
# `author` is read in as character: inspect its type, convert it to a factor
# for the analysis, then confirm the new type
class(spooky[["author"]])
## [1] "character"
spooky[["author"]] <- factor(spooky[["author"]])
class(spooky[["author"]])
## [1] "factor"

How many texts does each author have in the dataset?

# Tabulate how many excerpts belong to each author, then display the counts
num_texts <- table(spooky[["author"]])
print(num_texts)
## 
##  EAP  HPL  MWS 
## 7900 5635 6044

Plot the composition of the number of texts from the 3 authors in a pie chart, displaying counts and percentages

# One label per slice: author code, raw count, and share of the total rounded
# to one decimal place, each on its own line
lbls <- paste0(
  names(num_texts), '\n',
  num_texts, '\n',
  round(num_texts / sum(num_texts) * 100, 1), '%'
)

# 3D pie of the per-author counts, using the labels built above
pie3D(
  num_texts,
  labels = lbls,
  explode = 0.05,
  labelcex = 0.8
)

Writing Style

Do some authors use more questions in the texts than others?

  • Count number of question marks in texts for spooky
  • Add a field num_qns for the counts
  • Wrangle data to show counts for each author
  • Plot a waffle chart to see comparison of use of questions in texts among 3 authors.
# Total number of question marks across all excerpts. The count has to run on
# the `text` column: passing the whole data frame to str_count() yields one
# number per column rather than a count for the texts.
sum(str_count(spooky$text, '\\?'))
## [1] 1098
# Per-excerpt question-mark count, stored alongside the original columns
dat1 <- spooky %>%
  mutate(num_qns = str_count(text, "\\?"))

# Sum the per-excerpt counts within each author. Naming the inputs inside the
# lists gives the result its `Author` and `num_qns` columns directly, so no
# separate renaming step is needed.
dat2 <- aggregate(
  list(num_qns = dat1$num_qns),
  by = list(Author = dat1$author),
  FUN = sum
)
dat2
##   Author num_qns
## 1    EAP     510
## 2    HPL     169
## 3    MWS     419
# Waffle chart of question counts per author. The named vector is built from
# dat2 itself (values from `num_qns`, names from `Author`) instead of from
# hard-coded row positions and labels, so the chart stays correctly labelled
# if the row order or the set of authors changes.
waffle(
  setNames(dat2$num_qns, as.character(dat2$Author)),
  rows = 20,
  size = 0.5,
  title = 'Count of Questions in Texts by Authors',
  xlab = '(1 square == 1 question)'
)

Sentiment analysis

Positive and negative emotional content comparison in authors’ text

Apply sentiment analysis using the bing lexicon. Since the 3 widely used lexicons are more aligned with modern language usage than with the language of the authors' era (1800s to early 1900s), I am interested in exploring the emotional content of the authors' texts in general terms.

# Preview the bing lexicon: one row per word with its sentiment label
get_sentiments(lexicon = "bing")
## # A tibble: 6,788 x 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faced     negative 
##  2 2-faces     negative 
##  3 a+          positive 
##  4 abnormal    negative 
##  5 abolish     negative 
##  6 abominable  negative 
##  7 abominably  negative 
##  8 abominate   negative 
##  9 abomination negative 
## 10 abort       negative 
## # ... with 6,778 more rows
# Split every excerpt in `text` into tokens, one row per token in a new
# `word` column
tidy_text <- spooky %>%
  unnest_tokens(word, text)

# Keep only the words that appear in the bing lexicon, attaching each word's
# sentiment label; the join key is inferred from the shared `word` column
tidy_text_sentiment <- inner_join(tidy_text, get_sentiments('bing'))
## Joining, by = "word"
# Show the first ten matched words with their sentiment
head(tidy_text_sentiment, n = 10)
##         id author        word sentiment
## 1  id26305    EAP     dungeon  negative
## 2  id26305    EAP   perfectly  positive
## 3  id17569    HPL     mistake  negative
## 4  id11008    EAP        gold  positive
## 5  id11008    EAP   fantastic  positive
## 6  id11008    EAP incessantly  negative
## 7  id11008    EAP    greatest  positive
## 8  id27763    MWS      lovely  positive
## 9  id27763    MWS     fertile  positive
## 10 id27763    MWS       happy  positive
# Cross-tabulate sentiment (rows) against author (columns)
dat3 <- table(tidy_text_sentiment$sentiment, tidy_text_sentiment$author)

# Pyramid plot: first table row on the left side, second row on the right
# side, one bar per author (columns 1 to 3)
pyramid.plot(
  dat3[1, 1:3],
  dat3[2, 1:3],
  top.labels = NULL,
  show.values = TRUE,
  ndig = 0,
  main = 'Author by Sentiments',
  unit = c('Negative', 'Positive'),
  ppmar = c(4, 4, 4, 4),
  laxlab = FALSE,
  raxlab = FALSE
)
## [1] 5.1 4.1 4.1 2.1
# NOTE(review): the legend colours are hard-coded; presumably they match the
# bar colours pyramid.plot picks by default -- confirm against the figure
legend(
  'topright',
  legend = c("EAP", "HPL", "MWS"),
  col = c("red", "green", "blue"),
  lty = 1,
  bty = 'n',
  lwd = 8,
  cex = 0.8,
  horiz = TRUE
)

What are the top 100 positive and negative words that the authors use?

# Comparison cloud of the sentiment-bearing words

# Frequency of every lexicon word together with its sentiment, most frequent
# first
cloud_counts <- count(
  inner_join(tidy_text, get_sentiments("bing")),
  word, sentiment,
  sort = TRUE
)

# Reshape to a word-by-sentiment matrix, filling missing combinations with 0
cloud_matrix <- acast(
  cloud_counts,
  word ~ sentiment,
  value.var = "n",
  fill = 0
)

# Draw at most 100 words, one colour per sentiment column
comparison.cloud(
  cloud_matrix,
  colors = c("darkgreen", "purple"),
  max.words = 100
)
## Joining, by = "word"

  • Aggregate number of negative and positive words used by each author
  • Plot a bubble chart to show sentiments and num_qns by authors
# Author-by-sentiment counts as a plain data frame (authors in the row names)
dat4 <- as.data.frame.matrix(
  table(tidy_text_sentiment$author, tidy_text_sentiment$sentiment)
)

# Move the author out of the row names into an explicit first column and give
# the three columns their final names
dat4 <- setNames(
  data.frame(rownames(dat4), dat4, row.names = NULL),
  c('Author', 'negative', 'positive')
)


# Combine question counts and sentiment counts, matching rows on Author
dat5 <- dat2 %>%
  inner_join(dat4, by = 'Author')
dat5
##   Author num_qns negative positive
## 1    EAP     510     7203     6144
## 2    HPL     169     7605     3731
## 3    MWS     419     8150     6799
#   Author num_qns negative positive
# 1    EAP     510     7203     6144
# 2    HPL     169     7605     3731
# 3    MWS     419     8150     6799


# Bubble chart: Sentiments and Num of Questions per Author
# x = positive word count, y = negative word count, bubble size = number of
# questions, colour = author
p <- plot_ly(
  dat5,
  x = ~positive,
  y = ~negative,
  size = ~num_qns,
  color = ~Author,
  type = 'scatter',
  mode = 'markers',
  marker = list(opacity = 0.5)
)

# Title and axis captions; grid lines and legend are switched off
p <- layout(
  p,
  title = '<b>Sentiments and Num of Questions per Author</b>',
  xaxis = list(title = '<b>Positive words</b>', showgrid = FALSE),
  yaxis = list(title = '<b>Negative words</b>', showgrid = FALSE),
  showlegend = FALSE
)

# Arrow annotation showing each author and its question count
p <- add_annotations(
  p,
  text = paste(dat5$Author, '\n', dat5$num_qns),
  xref = "x",
  yref = "y",
  showarrow = TRUE,
  arrowsize = 0.5,
  ax = 40,
  ay = -60
)

p
#export(p, file = '/Users/qinqingao/Documents/GitHub/spring2018-project1-ginnyqg/figs/Bubble.png')
  • With the same method, explore the relationship between the authors’ use of questions and their total volume of text.
# Length of each excerpt in characters, stored on spooky as `sen_length`
spooky$sen_length <- str_length(spooky$text)

# dat6 holds the same columns as spooky, including the new sen_length
dat6 <- spooky

# Total characters per author. Naming the inputs inside the lists gives the
# result its `Author` and `sen_length` columns directly.
dat7 <- aggregate(
  list(sen_length = dat6$sen_length),
  by = list(Author = dat6$author),
  FUN = sum
)

# Attach the question and sentiment counts, matching rows on Author
dat8 <- dat7 %>%
  inner_join(dat5, by = 'Author')

# Per-author excerpt counts as a two-column data frame
new_num_texts <- setNames(melt(num_texts), c('Author', 'num_text'))

# Final per-author summary: excerpt count, total length, questions, sentiments
dat9 <- new_num_texts %>%
  inner_join(dat8, by = 'Author')
dat9
##   Author num_text sen_length num_qns negative positive
## 1    EAP     7900    1123585     510     7203     6144
## 2    HPL     5635     878178     169     7605     3731
## 3    MWS     6044     916632     419     8150     6799
# Bubble chart: Num of Texts, Questions, Sentence Length per Author
# x = number of excerpts, y = number of questions, bubble size = total length
# in characters, colour = author
p2 <- plot_ly(
  dat9,
  x = ~num_text,
  y = ~num_qns,
  size = ~sen_length,
  color = ~Author,
  type = 'scatter',
  mode = 'markers',
  marker = list(opacity = 0.5)
)

# Title and axis captions; grid lines and legend are switched off
p2 <- layout(
  p2,
  title = '<b>Num of Texts, Questions, Sentence Length per Author</b>',
  xaxis = list(title = '<b>Num of Texts</b>', showgrid = FALSE),
  yaxis = list(title = '<b>Num of Questions</b>', showgrid = FALSE),
  showlegend = FALSE
)

# Arrow annotation showing each author and its total length, formatted with
# thousands separators
p2 <- add_annotations(
  p2,
  text = paste(
    dat9$Author, '\n',
    prettyNum(dat9$sen_length, big.mark = ',', scientific = FALSE)
  ),
  xref = "x",
  yref = "y",
  showarrow = TRUE,
  arrowsize = 0.5,
  ax = 40,
  ay = -60
)


p2
#export(p2, file = '/Users/qinqingao/Documents/GitHub/spring2018-project1-ginnyqg/figs/Bubble_num_text_qns_sent.png')